experiment_name <- "fveg_1-18_gb"
experiment_path <- here("data", "4_for_analysis", "ML_outputs", "experiments", experiment_name)
dataset <- "fveg"
months_ts <- TRUE
# read in the fallow fields predictions
fallow <- fread(here(experiment_path, "fallow.csv"))
fallow[fallow == -9999] <- NA # this is the na value
if (months_ts){
# get a nice column of numeric months
months <- select(fallow, names(fallow)[grepl("month", names(fallow))])
fallow$month <- names(months)[max.col(months)]
fallow$month <- str_extract(fallow$month, '(?<=_)\\d+') %>% as.numeric()
}
if (months_ts){
# with different months
r2 <- summary(lm(ET~ET_pred, data=fallow))$r.squared
bias <- mean(fallow$ET_pred - fallow$ET, na.rm=TRUE)
ggplot(fallow) +
geom_jitter(aes(x=ET_pred, y=ET, color=as.factor(month)), alpha=0.2, size =.1) +
theme_classic() +
geom_abline(intercept=0,slope=1, color="red") +
annotate("text", x=4*30, y=1.5, label= paste("R2:", round(r2, 3), "Bias:", round(bias, 3))) +
xlab("Predicted natural ET (mm/month)") +
ylab("Observed fallow ET (mm/month)")
}
if (months_ts){
# averaging over different months
fallow_year <- fallow %>% group_by(x, y) %>% summarize(ET = mean(ET, na.rm=TRUE),
ET_pred = mean(ET_pred, na.rm=TRUE))
} else {
fallow_year <- fallow
}
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
r2 <- summary(lm(ET~ET_pred, data=fallow_year))$r.squared
bias <- mean(fallow_year$ET_pred - fallow_year$ET, na.rm=TRUE)
ggplot(fallow_year) +
geom_jitter(aes(x=ET_pred, y=ET), alpha=0.2, size =.1) +
theme_classic() +
geom_abline(intercept=0,slope=1, color="red") +
annotate("text", x=3*30, y=1.5, label= paste("R2:", round(r2, 3), "Bias:", round(bias, 3))) +
xlab("Predicted natural ET (mm/month)") +
ylab("Observed fallow ET (mm/month)")
if (months_ts){
# plot out the time series
fallow_ts <- fallow %>% group_by(month) %>% summarize(ET = mean(ET, na.rm=TRUE),
ET_pred = mean(ET_pred, na.rm=TRUE))
ggplot(fallow_ts) +
geom_line(aes(x=month, y=ET), color="red") +
geom_line(aes(x=month, y=ET_pred), color="black")
}
Here I just got curious about what ag ET vs fallow ET look like. Seems like fallow ET is probably artificially high.
# ag <- fread(here("data", "3_for_counterfactual", "agriculture", "agriculture.csv"))
# ag$month <- str_extract(ag$month, '\\b\\w+$') %>% as.numeric()
#
# # plot out the time series
# ag_ts <- ag %>% group_by(month) %>% summarize(ET = mean(ET, na.rm=TRUE))
#
# ggplot(ag_ts) +
# geom_line(aes(x=month, y=ET), color="red") +
# geom_line(data=fallow_ts, aes(x=month, y=ET), color="black")
# read in the spatial_CV predictions
cv <- fread(here(experiment_path, "crossval_predictions_train.csv"))
cv[cv == -9999] <- NA # this is the na value
if (months_ts){
# get a nice column of numeric months
months <- select(cv, names(cv)[grepl("month", names(cv))])
cv$month <- names(months)[max.col(months)]
cv$month <- str_extract(cv$month, '(?<=_)\\d+') %>% as.numeric()
}
cv$mean_dist <- cv$fold_size/6 # the average distance between a held out point and the border to the hold out
if (months_ts){
gridded_cv_metrics <- cv %>%
group_by(x, y, cv_fold, fold_size) %>%
summarize(ET = mean(ET, na.rm = TRUE),
ET_pred = mean(ET_pred, na.rm = TRUE)) %>%
group_by(cv_fold, fold_size) %>%
summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2),
bias = mean(ET_pred - ET, na.rm = TRUE),
rmse = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2),
mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
ET = mean(ET, na.rm = TRUE),
ET_pred = mean(ET_pred, na.rm = TRUE))
} else {
gridded_cv_metrics <- cv %>%
group_by(cv_fold, fold_size) %>%
summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2),
bias = mean(ET_pred - ET, na.rm = TRUE),
rmse = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2),
mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
ET = mean(ET, na.rm = TRUE),
ET_pred = mean(ET_pred, na.rm = TRUE))
}
## `summarise()` has grouped output by 'x', 'y', 'cv_fold'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'cv_fold'. You can override using the `.groups` argument.
Turn the cv fold back into something meaningful
# first turn "fold" into x and y coordinates
gridded_cv_metrics$x <- as.numeric(str_extract(gridded_cv_metrics$cv_fold, "[-.0-9]*(?=,)"))
gridded_cv_metrics$y <- as.numeric(str_extract(gridded_cv_metrics$cv_fold, "(?<=,)[-.0-9]*"))
# study area and counties for plotting
study_area <- st_read(study_area_loc) %>% st_transform(st_crs("+proj=longlat +datum=WGS84"))
## Reading layer `study_area' from data source
## `/home/waves/users/annaboser/ET_ag_disALEXI/data/1_raw/study_area/study_area.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 3 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -218673.3 ymin: -352229.6 xmax: 135464.2 ymax: 256511.5
## Projected CRS: NAD83 / California Albers
counties <- st_read(counties_loc) %>% filter(STATEFP == "06") %>% st_transform(st_crs("+proj=longlat +datum=WGS84"))
## Reading layer `cb_2018_us_county_500k' from data source
## `/home/waves/users/annaboser/ET_ag_disALEXI/data/1_raw/counties/cb_2018_us_county_500k.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 3233 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -179.1489 ymin: -14.5487 xmax: 179.7785 ymax: 71.36516
## Geodetic CRS: NAD83
filter(gridded_cv_metrics, fold_size !=1) %>%
ggplot() +
geom_sf(data = counties, color=alpha("white",1), size = .2) +
geom_raster(aes(x=x, y=y, fill=rmse)) +
facet_wrap(vars(fold_size/1000)) +
scale_fill_distiller(palette="Spectral", direction = -1) + #, limits = c(0,2)) +
geom_sf(data = study_area, fill=alpha("red",0), color = "black", size = .2) +
theme_classic() +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
## Warning: Raster pixels are placed at uneven horizontal intervals and will be
## shifted. Consider using geom_tile() instead.
## Warning: Raster pixels are placed at uneven vertical intervals and will be
## shifted. Consider using geom_tile() instead.
First get the distribution of distances between ag and natural data (specifically the test data)
dist_filename <- here(distance_distribution_path, paste0(dataset, ".csv"))
if (file.exists(dist_filename)){
library(parallel)
library(tictoc)
dist <- fread(dist_filename)
} else {
stop('No distance file for this dataset. Make one using 3_analysis/1_additional_data_manipulation/2_ag_natural_distance.R')
}
##
## Attaching package: 'tictoc'
## The following object is masked from 'package:data.table':
##
## shift
if (months_ts){
cv_metrics <- cv %>%
group_by(x, y, fold_size, mean_dist) %>%
summarize(ET = mean(ET, na.rm = TRUE),
ET_pred = mean(ET_pred, na.rm = TRUE)) %>%
group_by(fold_size, mean_dist) %>%
summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2),
bias = mean(ET_pred - ET, na.rm = TRUE),
rmse = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2),
mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
ET = mean(ET, na.rm = TRUE),
ET_pred = mean(ET_pred, na.rm = TRUE))
} else {
cv_metrics <- cv %>%
group_by(fold_size, mean_dist) %>%
summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2),
bias = mean(ET_pred - ET, na.rm = TRUE),
rmse = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2),
mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
ET = mean(ET, na.rm = TRUE),
ET_pred = mean(ET_pred, na.rm = TRUE))
}
## `summarise()` has grouped output by 'x', 'y', 'fold_size'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'fold_size'. You can override using the `.groups` argument.
coef = 2/5
ggplot(NULL) +
stat_density(data = dist, aes(x=mindist/1000), fill = "grey10", alpha = .3) +
geom_line(data = cv_metrics, aes(x=mean_dist/1000, y = r2*coef), color = "red") +
geom_point(data = cv_metrics, aes(x=mean_dist/1000, y = r2*coef), color = "red") +
xlab(TeX("Distance of agricultural pixel to nearest natural pixel (km) \n Average distance of point to border of hold-out (km)")) +
theme_bw() +
theme(axis.title.x = element_text(vjust=-2.5)) +
scale_y_continuous(
name = "Distribution of agricultural pixels",
sec.axis = sec_axis(~./coef, name=TeX("Model performance ($R^{2}$)")))
if (months_ts){
cv_metrics <- cv %>%
group_by(fold_size, month, mean_dist) %>%
summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2),
bias = mean(ET_pred - ET, na.rm = TRUE),
rmse = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2),
mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
ET = mean(ET, na.rm = TRUE),
ET_pred = mean(ET_pred, na.rm = TRUE))
coef = 2/5
ggplot(NULL) +
stat_density(data = dist, aes(x=mindist/1000), fill = "grey10", alpha = .3) +
geom_line(data = cv_metrics, aes(x=mean_dist/1000, y = r2*coef, color = as.factor(month))) +
geom_point(data = cv_metrics, aes(x=mean_dist/1000, y = r2*coef, color = as.factor(month))) +
xlab(TeX("Distance of agricultural pixel to nearest natural pixel (km) \n Average distance of point to border of hold-out (km)")) +
theme_bw() +
theme(axis.title.x = element_text(vjust=-2.5)) +
scale_y_continuous(
name = "Distribution of agricultural pixels",
sec.axis = sec_axis(~./coef, name=TeX("Model performance ($R^{2}$)")))
}
## `summarise()` has grouped output by 'fold_size', 'month'. You can override using
## the `.groups` argument.
Make these using the CV with the fold size that is closest to the double of the mean distance between ag and natural (a bit conservative but only slightly)
mean_dist <- mean(dist$mindist)
folds <- unique(cv$mean_dist)
best_fold <- folds[abs(folds-(mean_dist)) == min(abs(folds-(mean_dist)))]
cv_best_fold <- filter(cv, mean_dist == best_fold)
print(cv_best_fold)
## x y Agriculture FVEG CPAD CDL Elevation Slope
## 1: -122.2117 40.30450 NA 1 NA NA 120.4524 0.01331826
## 2: -122.2104 40.30450 NA 1 NA 1 121.0478 0.02825651
## 3: -122.2079 40.30450 NA 1 NA 1 120.1848 0.02384662
## 4: -122.2073 40.30450 NA 1 NA 1 119.2702 0.01329535
## 5: -122.2060 40.30450 NA 1 NA 1 119.2642 0.01675222
## ---
## 13568658: -118.9116 34.84324 NA 1 NA 1 1308.4076 0.35483694
## 13568659: -118.9103 34.84324 NA 1 NA NA 1288.0094 0.06948160
## 13568660: -118.9090 34.84324 NA 1 NA NA 1293.0698 0.27559921
## 13568661: -118.9084 34.84324 NA 1 NA NA 1288.1407 0.30591771
## 13568662: -118.9116 34.84261 NA 1 NA 1 1332.6322 0.38480535
## Aspect Soil TWI year ET PET month_1
## 1: 6.2714238 0.14174530 18.78722 2021 27.000000 0.9591289 1
## 2: 5.5350285 0.29090971 18.08801 2021 30.000000 0.9596014 1
## 3: 1.0666770 0.15410967 18.03865 2021 29.829855 0.9605466 1
## 4: 1.0363326 0.17099379 18.56828 2021 31.212646 0.9607829 1
## 5: 6.1137919 0.15492515 18.22674 2021 32.490223 0.9612554 1
## ---
## 13568658: 0.8541588 0.01784754 15.74251 2021 7.157592 1.0310227 0
## 13568659: 2.7958539 0.07963096 16.72223 2021 6.802347 1.0306276 0
## 13568660: 3.0064828 0.23268874 15.94000 2021 2.549413 1.0302325 0
## 13568661: 2.6084707 0.24122863 15.90047 2021 2.098826 1.0300349 0
## 13568662: 1.1571892 0.06803262 15.57051 2021 4.838519 1.0311517 0
## month_2 month_3 month_4 month_5 month_6 month_7 month_8 month_9
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## ---
## 13568658: 0 0 0 0 0 0 0 0
## 13568659: 0 0 0 0 0 0 0 0
## 13568660: 0 0 0 0 0 0 0 0
## 13568661: 0 0 0 0 0 0 0 0
## 13568662: 0 0 0 0 0 0 0 0
## month_10 month_11 month_12 fold_size
## 1: 0 0 0 10000
## 2: 0 0 0 10000
## 3: 0 0 0 10000
## 4: 0 0 0 10000
## 5: 0 0 0 10000
## ---
## 13568658: 0 0 1 10000
## 13568659: 0 0 1 10000
## 13568660: 0 0 1 10000
## 13568661: 0 0 1 10000
## 13568662: 0 0 1 10000
## cv_fold ET_pred month mean_dist
## 1: -122.24719101123596,40.27027027027027 25.264435 1 1666.667
## 2: -122.24719101123596,40.27027027027027 24.291529 1 1666.667
## 3: -122.24719101123596,40.27027027027027 23.829910 1 1666.667
## 4: -122.24719101123596,40.27027027027027 24.255668 1 1666.667
## 5: -122.24719101123596,40.27027027027027 25.264435 1 1666.667
## ---
## 13568658: -118.98876404494382,34.77477477477478 21.539060 12 1666.667
## 13568659: -118.98876404494382,34.77477477477478 12.021730 12 1666.667
## 13568660: -118.98876404494382,34.77477477477478 9.390516 12 1666.667
## 13568661: -118.98876404494382,34.77477477477478 9.186157 12 1666.667
## 13568662: -118.98876404494382,34.77477477477478 15.883162 12 1666.667
if (months_ts){
# with different months
r2 <- summary(lm(ET~ET_pred, data=cv_best_fold))$r.squared
bias <- mean(cv_best_fold$ET_pred - cv_best_fold$ET, na.rm=TRUE)
ggplot(cv_best_fold) +
geom_jitter(aes(x=ET_pred, y=ET, color=as.factor(month)), alpha=0.2, size =.1) +
theme_classic() +
geom_abline(intercept=0,slope=1, color="red") +
# annotate("text", x=4.5, y=1.5, label= paste("R2:", round(r2, 3), "Bias:", round(bias, 3))) +
xlab("Predicted natural ET (mm/month)") +
ylab("Observed fallow ET (mm/month)")
}
print(paste("R2:", round(r2, 3), "Bias:", round(bias, 3)))
## [1] "R2: 0.642 Bias: 0.403"
if (months_ts){
# averaging over different months
cv_best_fold_year <- cv_best_fold %>% group_by(x, y) %>% summarize(ET = mean(ET, na.rm=TRUE),
ET_pred = mean(ET_pred, na.rm=TRUE))
} else {
cv_best_fold_year <- cv_best_fold
}
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
r2 <- summary(lm(ET~ET_pred, data=cv_best_fold_year))$r.squared
bias <- mean(cv_best_fold_year$ET_pred - cv_best_fold_year$ET, na.rm=TRUE)
ggplot(cv_best_fold_year) +
# stat_density_2d(aes(x=ET_pred, y=ET,fill = stat(density)), geom = 'raster', contour = FALSE) +
# scale_fill_viridis_c() +
geom_jitter(aes(x=ET_pred, y=ET), alpha=0.01, size =.01, color = 'black') +
theme(legend.position="none") +
theme_classic() +
geom_abline(intercept=0,slope=1, color="red") +
# annotate("text", x=4.5*30, y=1.5, label= paste("R2:", round(r2, 3), "Bias:", round(bias, 3))) +
xlab("Predicted natural ET (mm/month)") +
ylab("Observed fallow ET (mm/month)")
print(paste("R2:", round(r2, 3), "Bias:", round(bias, 3)))
## [1] "R2: 0.651 Bias: 0.403"
if (months_ts){
# plot out the time series
cv_best_fold_ts <- cv_best_fold %>% group_by(month) %>% summarize(ET = mean(ET, na.rm=TRUE),
ET_pred = mean(ET_pred, na.rm=TRUE))
ggplot(cv_best_fold_ts) +
geom_line(aes(x=month, y=ET), color="red") +
geom_line(aes(x=month, y=ET_pred), color="black") +
geom_line(data = fallow_ts, aes(x=month, y=ET), color="blue") +
geom_line(data = fallow_ts, aes(x=month, y=ET_pred), color="green")
}